| BA9 R Markdown |
R version-4.1.0 import datasets file> Import Dataset > From Excel > Import
finalmatrix.xls: count matrix x= erv-loci locations, y= Sample names TREM2MasterAnnotation.xlsx = load meta data
#load install packages
library(dplyr)
library(tidyverse)
library(readxl)
library(DESeq2)
library(tibble)
library(plotly)
library(ggplot2)
library(EnhancedVolcano)
library(sva)
library(DT)
#BA9
cts <- read_excel("~/Genomic Medicine/Project/msc_project/finalmatrix.xls")
#header = TRUE , sep="\t", fill = TRUE, na.strings = "(NA)")
colnames(cts) <-sub("\\.", "-", colnames(cts))
cts <- as.data.frame(cts)
rownames(cts) <-cts$`erv-id`
#remove all row and col with NA
cts <- cts[(1:789),]
#remove MCI "RDobson-117
#remove gyrus"RDobson-129", "RDobson-130", "RDobson-131", "RDobson-133"
#remove HC "RDobson-132
#remove failed sequencing "RDobson-98"
#64 remaining
#outliers , low quality , "RDobson-36" 'RDobson-52'
ctsBA9 <- cts %>% select('erv-id', "RDobson-97", "RDobson-22", "RDobson-94", "RDobson-42","RDobson-96", "RDobson-26", "RDobson-36", "RDobson-52", "RDobson-20", "RDobson-43", "RDobson-107", "RDobson-61", "RDobson-95", "RDobson-121", "RDobson-127", "RDobson-114", "RDobson-65", "RDobson-16", "RDobson-102", "RDobson-35", "RDobson-57", "RDobson-51", "RDobson-72", "RDobson-89", "RDobson-13", "RDobson-66", "RDobson-34", "RDobson-60", "RDobson-17", "RDobson-112", "RDobson-32", "RDobson-87", "RDobson-41", "RDobson-64", "RDobson-90", "RDobson-116", "RDobson-81", "RDobson-50", "RDobson-3", "RDobson-73", "RDobson-77", "RDobson-12", "RDobson-39", "RDobson-110", "RDobson-37", "RDobson-99", "RDobson-59", "RDobson-55", "RDobson-23", "RDobson-124", "RDobson-11", "RDobson-86", "RDobson-108", "RDobson-25", "RDobson-83", "RDobson-24", "RDobson-105", "RDobson-103", "RDobson-75", "RDobson-68", "RDobson-123", "RDobson-79", "RDobson-71")
#HC and EC
meta <- read_xlsx("~/Genomic Medicine/Project/TREM2MasterAnnotation.xlsx")
## New names:
## * `Subject id` -> `Subject id...1`
## * `Sample ID` -> `Sample ID...3`
## * `Subject id` -> `Subject id...16`
## * `RNA tube labels` -> `RNA tube labels...66`
## * `RNA tube labels` -> `RNA tube labels...67`
## * ...
metaBA9 <- select(meta, `Sample ID...73`, Diagnosis_1, Tissue , `Sex`, `Age (at death)`, `PostMortemDelay (hours)`, `RIN Score`, `No. of E4 alleles`, `Sequencing Pool`, `Braak and Braak stage (modified Braak Stage)`)
#select only BA9 tissue
metaBA9<-metaBA9[metaBA9$Tissue == 'BA9',]
#correct colnames into readable format
names(metaBA9)[1] <-"Sample"
names(metaBA9)[5] <-"Age_at_death"
names(metaBA9)[6] <-"PostMortem_Delay_hours"
names(metaBA9)[7] <-"RIN_Score"
names(metaBA9)[8] <-"Num_of_E4_alleles"
names(metaBA9)[9] <-"Seq_pool"
names(metaBA9)[10] <-"Braak_Stage"
#define Braak values
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "TBC", "0")
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "UnusualNoBraak", "0")
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "modified Braak BNE II", "II")
#metaBA9$`Braak_Stage` <- replace(metaBA9$`Braak_Stage`, metaBA9$`Braak_Stage` == "HP Tau stage I", "I")
#specify bin values
metaBA9$Num_of_E4_alleles <- as.factor(metaBA9$Num_of_E4_alleles)
#change NA values to 0
metaBA9[is.na(metaBA9)] <- 0
#place RIN value in quintiles bins
#metaBA9 = mutate(metaBA9, quantile_rank = ntile(metaBA9$RIN_Score,5))
#metaBA9$quantile_rank<- as.factor(metaBA9$quantile_rank)
#place PostMortem_Delay_hours in quintile bins
#metaBA9 = mutate(metaBA9, quantile_rank = ntile(metaBA9$PostMortem_Delay_hours,5))
#metaBA9$quantile_rank <- as.factor(metaBA9$quantile_rank)
#place Age_at_death in quintile bins
#metaBA9 = mutate(metaBA9, quantile_rank = ntile(metaBA9$Age_at_death,5))
#metaBA9$quantile_rank <- as.factor(metaBA9$quantile_rank)
metaBA9 <- as.data.frame(metaBA9)
#rm RDobson-98, RDobson-117
metaBA9 <- metaBA9[-c(7,10),]
#8,9
#run deseq2
dds <- DESeqDataSetFromMatrix(countData = ctsBA9 ,
colData = metaBA9 ,
design =~Sex + Age_at_death + RIN_Score + PostMortem_Delay_hours + Num_of_E4_alleles + Seq_pool + Diagnosis_1 ,
tidy = TRUE)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
rld <- varianceStabilizingTransformation(dds)
rld_pca <- plotPCA(rld, intgroup=c("Diagnosis_1")) +
labs(title = "BA9- PCA plot 2 ") +
aes(label = colnames(rld))
rld_pca_plotly <- ggplotly(rld_pca)
rld_pca_plotly
#correct for known batch effects using SVA and ComBat
#"RDobson-36" 'RDobson-52' replaced to observe effect
batch <- metaBA9$Seq_pool
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 1", '1')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 2", '2')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 3", '3')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 4", '4')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 5", '5')
metaBA9$Seq_pool <- replace(metaBA9$Seq_pool, metaBA9$Seq_pool == "Pool 6", '6')
metaBA9$Seq_pool <- as.numeric(metaBA9$Seq_pool)
dat <- as.matrix(ctsBA9[2:ncol(ctsBA9)])
modcombat = model.matrix(~1, data =(as.data.frame(metaBA9$Num_of_E4_alleles)))
data_adjusted_BA9 <- ComBat_seq(dat, batch=batch, group = modcombat)
## Found 6 batches
## Using null model in ComBat-seq.
## Adjusting for 0 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data
#plot PCA
pca_adjustedBA9 <-prcomp(data_adjusted_BA9)
pca_adjustedBA9<- summary(pca_adjustedBA9)
#select pca 1 and pca 2, make matrix
PCA1 <- pca_adjustedBA9$rotation[,1:2]
#add phenotype info
PCA1 <-cbind(PCA1,metaBA9[c(1,2,4,5,6,7,8,9)])
#shows %pc1
#head(pca_adjustedBA9)
#plot
ggplotly(ggplot(data = PCA1,
aes(x=PC1, y=PC2, col= Diagnosis_1))+
xlab("PC1: 0.91% variance") +
ylab("PC2: 0.03% variance") +
geom_point()+
ggtitle("BA9- SVA adjusted PCA"))
data_adjusted_BA9 <- as.matrix(data_adjusted_BA9)
dds <- DESeqDataSetFromMatrix(countData = data_adjusted_BA9 ,
colData = metaBA9 ,
design =~ ~Sex + Age_at_death + RIN_Score + PostMortem_Delay_hours + Num_of_E4_alleles + Seq_pool + Diagnosis_1 )
dds <- DESeq(dds)
res <- results(dds, contrast = c("Diagnosis_1","AD", "Control"))
res$ervid <- row.names(res)
#top5BA9 <-c('3182','3597','3397','3937','3203')
BA9_volcanco <- EnhancedVolcano(res, lab= rownames(res),
x ='log2FoldChange', y = 'padj',
#selectLab = c(top5BA9),
ylab = ~-Log[10]~adjusted~italic(P),
title = "DESeq2 results BA9",
subtitle = "Differential expression Volcano Plot",
legendPosition = "bottom",
xlim = c(-3,3),
ylim = c(0,5),
pCutoff = 0.05,
drawConnectors = TRUE)
#look at res table as a sorted data.frame
datatable(as.data.frame(res))
#write file as csv to export to excel
#write.csv(as.data.frame(res), file="BA9_ERV_loci_results.csv")
# subset values less than the p adjsut value <0.05
resSig <- subset(res, padj < 0.05)
#view top downregulated log fold change
head(resSig[ order(resSig$log2FoldChange), ])
## log2 fold change (MLE): Diagnosis_1 AD vs Control
## Wald test p-value: Diagnosis_1 AD vs Control
## DataFrame with 6 rows and 7 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## 3597 16.8585 -2.07632 0.412420 -5.03448 4.79154e-07 0.000115716
## 3937 304.2506 -1.42632 0.292508 -4.87618 1.08158e-06 0.000130600
## 3916 21.5422 -1.19943 0.355523 -3.37371 7.41623e-04 0.011193879
## 6125 18.3642 -1.15687 0.335708 -3.44607 5.68801e-04 0.009501209
## 3775 102.9523 -1.13890 0.259761 -4.38444 1.16286e-05 0.000702078
## 3316 1324.8878 -1.11382 0.313276 -3.55538 3.77426e-04 0.008039493
## ervid
## <character>
## 3597 3597
## 3937 3937
## 3916 3916
## 6125 6125
## 3775 3775
## 3316 3316
#view top upregulated log fold change
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): Diagnosis_1 AD vs Control
## Wald test p-value: Diagnosis_1 AD vs Control
## DataFrame with 6 rows and 7 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## 3743 125.7519 1.851115 0.395071 4.68553 2.79241e-06 0.000224789
## 5937 16.8099 1.571864 0.458043 3.43170 5.99821e-04 0.009657118
## 3505 22.7659 1.391711 0.392027 3.55004 3.85169e-04 0.008039493
## 3397 268.6015 1.095974 0.223821 4.89665 9.74866e-07 0.000130600
## 5980 27.7196 0.968430 0.324233 2.98683 2.81882e-03 0.028194791
## 3557 258.4234 0.951309 0.292603 3.25120 1.14920e-03 0.015001745
## ervid
## <character>
## 3743 3743
## 5937 5937
## 3505 3505
## 3397 3397
## 5980 5980
## 3557 3557
#top ERV with pvalue
topGene <- rownames(res)[which.min(res$padj)]
#plot control against AD for ERV loci 3397
ervcount <- counts(dds[c("3397")], normalized = TRUE)
m <- list(counts = as.numeric(ervcount), group = as.factor(dds$Diagnosis_1))
m <- as.tibble(m)
box3397 <- ggplot(m, aes(group, counts)) +
geom_boxplot(width = 0.3, outlier.shape = NA) +
geom_point(aes(color=group), position=position_jitter(width=0.1, height=0.1)) +
labs(x = "Alzhiemer's vs Control",
y = "Normalized Read Counts ",
title = "Normalized Count Expression of ERVmap loci 3397") +
theme(plot.title = element_text(face="bold", size=20))
box3397